\documentclass[11pt]{article}
\usepackage{amssymb, amsthm, amsmath}
\usepackage{bm}
\usepackage{graphicx}
\usepackage[authoryear]{natbib}
% duplicate \usepackage{bm} removed; already loaded above
\usepackage{verbatim}
\usepackage{lineno}
\usepackage{times}
\usepackage{soul}
\usepackage{color}

\usepackage[left=1in,top=1in,right=1in]{geometry}
\pdfpageheight 11in
\pdfpagewidth 8.5in
\linespread{1.0}
\input{mycommands.sty}


\begin{document}\linenumbers

\begin{center}
{\Large \textbf{A new spatial model for points above a threshold}}\\
\today
\end{center}

\section{Introduction}\label{s:intro}

\section{Statistical model}\label{s:model}

Our approach is to take the construction of \citet{schlather2002} for a max-stable model for block maxima as a literal model for extreme daily data.  We will show that this has nice theoretical and computational properties on the daily scale, and gives the popular model of \citet{schlather2002} as the limit for the block maximum data.

Let $Y_t(\bs)\in \calR$ be the observed value at location $\bs$ on day $t$.  To avoid bias in estimating tail parameters, we model the thresholded data
\beq\label{Yt}
  \Yt_t(\bs) = \left\{
          \begin{array}{ll}
            Y_t(\bs) & Y_t(\bs)>T \\
            T & Y_t(\bs)\le T
          \end{array}
        \right.
\eeq
where $T$ is a pre-specified threshold.   

We first specify a model for the complete data, $Y_t(\bs)$, and then study the induced model for thresholded data, $\Yt_t(\bs)$.  
The full data model is given in Section \ref{s:model1} assuming a multivariate normal distribution with a different variance each day.
Computationally, the values below the threshold are updated using standard Bayesian missing data methods as described in Section \ref{s:comp}.

\subsection{Complete data}\label{s:model1}
Consider the model
\begin{align} \label{fullmodel}
  Y_t(\bs) &= X_t(\bs) \beta + e_t(\bs)\\
  e_t(\bs) &= \sigma \delta | u_t(\bs) | + v_t(\bs)
\end{align}
where $u_t(\bs)$ has marginal $N(0, 1)$ distribution, $\delta \in (-1, 1)$ controls skew, and $v_t(\bs)$ is a spatial process with mean zero and variance $\sigma^2(1 - \delta^2)$.


% \subsection{Complete data}\label{s:model1}
% Consider a zero-mean Gaussian process $\epsilon_t(\bs)$ with $\Matern$ correlation structure.  
% Consider the model
% \beq\label{fullmodel}
%   Y_t(\bs) = \mu(\bs) + r_t W_t(\bs).
% \eeq
% where $r_t$ are daily random effects and $W_t$ are spatial processes with mean E$[W_t(\bs)]=0$, variance V$[W_t(\bs)]=1$, and $\Matern$ spatial correlation Cor$[W_t(\bs_1),W_t(\bs_2)]=M(||\bs_1-\bs_2||;\rho,\nu)$ with spatial range $\rho>0$, smoothness $\nu>0$.
% Let $\alpha$ be the proportion of the total variance, $\sigma^2_W$, that is accounted for by the partial sill of the spatial process.
% Then we denote the spatial $\Matern$ process model for $W_t$ as $W_t\sim \mbox{Mat}(0,1,\rho,\nu, \alpha)$.
% So for a single day, 
% \beq \label{eq:singleday}
% Y_t(\bs)|r_t \sim \mbox{Mat}[\mu(\bs), r_t, \rho, \nu, \alpha].
% \eeq
% We consider two different random effects distributions, $r_t \iid \mbox{GPD}(0, \sigma_r, \xi_r)$ and $r_t \iid \mbox{Inv. Gamma}(\xi_r, \sigma_r)$.



%\subsubsection{Special case}\label{s.powerlaw}
%Consider a special case where $r_t \iid GPD(\sigma_r/\xi_r, \sigma_r, \xi_r)$. Then $r_t$ follows a power law distribution. 
%The algebra works out and we can use an IG kernel to marginalize out the $r_t$ terms. 
%Similar results come from letting $r_t \iid GEV(\sigma_r/\xi_r, \sigma_r, \xi_r)$.
%Otherwise, I don't really have a good place to start for marginalizing over the random effect term. 
%After marginalizing out $r_t$ in this special case, we are left with (I'll add in the verification to the appendix)
%\beq
%f_Z(z) = \left[ \frac{ 1 }{ 2\sigma^2_W }(\bZ^T \Sigma^{-1} \bZ)\right]^{-\frac{ n }{ 2}} \Gamma\left(\frac{ n }{ 2 } \right) \left[ \frac{\xi_r}{\sigma_r} \right]^{-1/\xi_r - 1}  (2\pi)^{ -n/2} \sigma_W^{-n} \left| \Sigma \right|^{-1}\exp\left\{ -\frac{ 1 }{ 2 \sigma^2_W } \left(\bZ^T \Sigma^{-1} \bZ\right) \right\}
%\eeq
%which looks strangely like a GEV distribution.
%In fact, marginally at each site, I'm pretty sure this will follow a univariate GEV distribution.
%Also, to figure out the bivariate joint density from this distribution shouldn't all that challenging either.
%It seems like there's a relationship between the GEV and the Generalized Gamma Distribution, but as of right now, I can only get them to match up when we have the relationship that $\mu_r = \frac{ \xi_r }{ \sigma_r }$.
%The pdf for the Generalized Gamma Distribution is
%\beq \label{eq:ggamma}
% \frac{ p / a^d }{ \Gamma(d/p) } x^{ d-1 }\exp\{ -(x/a)^p \}
%\eeq
%and I also found a four-parameter version that gives a shift parameter with density
%\beq
% \frac{ \theta^\alpha c }{\Gamma(\alpha) } (x - \gamma)^{ c \alpha - 1} \exp\{ -\theta(x - \gamma)^c \}
%\eeq
%which is equivalent to (\ref{eq:ggamma}) where $\alpha = d/p$, $\theta=(1/a)^p$ and $c=p$
%Do you think I should continue pursuing this, or start focusing on modifying the MCMC now? 

\subsubsection{GPD random effect}
Let $r_t \iid \mbox{GPD}(0, \sigma_r, \xi_r)$.

\subsubsection{Power law random effect}
Let $r_t \iid \mbox{GPD}(\sigma_r/\xi_r, \sigma_r, \xi_r)$. Then by marginalizing out the $r_t$ terms, we find that 
\beq
	f_Y(y) = \frac{ 1 }{ \sigma_r }\left( \frac{ \xi_r }{ \sigma_r } \right)^{ -1/\xi_r - 1 } (2\pi)^{ -\frac{ n }{ 2 } }\beta^{ -\left( \frac{ 1 }{ \xi_r } + \frac{ n }{ 2 } \right) } \gamma\left( \frac{ 1 }{ \xi_r } + \frac{ n }{ 2 }, \frac{ \xi_r }{ \sigma_r } \beta \right)
\eeq
where $\beta = \frac{ 1 }{ 2 } (\bY - \bmu)^T \Sigma^{-1} (\bY - \bmu)$ and $\gamma(s, x)$ is the lower incomplete gamma function. 
We also find that marginally, 
\beqn
	f(y) = \frac{ 1 }{ \sigma_r }\left( \frac{ \xi_r }{ \sigma_r } \right)^{ -1/\xi_r - 1 } (2\pi)^{ -\frac{ 1 }{ 2 } }\beta^{ -\left( \frac{ 1 }{ \xi_r } + \frac{ 1 }{ 2 } \right) } \gamma\left( \frac{ 1 }{ \xi_r } + \frac{ 1 }{ 2 }, \frac{ \xi_r }{ \sigma_r } \beta \right)
\eeqn
where $\beta = \frac{ (y - \mu)^2 }{ 2 }$.

\subsubsection{Gamma random effect}
	Let $r_t \iid \mbox{IG}(\xi_r, \sigma_r)$. Then after marginalizing out the random effect, we find that $Y$ follows a multivariate T distribution with $2 \xi_r$ degrees of freedom, $\mu = \mu(\bs)$, and $\widehat{\Sigma} = \frac{ \sigma_r }{ \xi_r }{ \Sigma }$ where $\Sigma$ is the correlation matrix from (\ref{eq:singleday}). 
Then, for a single location 
\beqn\label{marg}
	P[Y_t(\bs) > y] &=& 1 - F(y)
\eeqn
where $F(y)$ is the distribution function for a t-distribution with $2 \xi_r$ degrees of freedom and location parameter $\mu(\bs)$.
The joint is 
\beqn\label{joint}
	P[Y_t(\bs_1) > y, Y_t(\bs_2) > y] &=& 1 - F(y) - F(y) + F_{12}(y) 
\eeqn
where $F_{12}(y)$ is the multivariate t-distribution described above.
In this model,
\beqn\label{chi}
	P[Y_t(\bs_1) > y | Y_t(\bs_2) > y] &=& \frac{ 1 - F(y) - F(y) + F_{12}(y) }{ 1 - F(y) }\nonumber\\
	&=& 1 - \frac{ F(y) - F_{12}(y) }{ 1 - F(y) } = \chi(h)
\eeqn
where $h = ||\bs_1 - \bs_2||$. It's not entirely obvious here, but the $h$ comes in from the $\widehat{\Sigma}$ in the t-distribution.
%From here on out is from the old version of the paper.
%Then if $\mu_W$ and $\sigma_W$ satisfy $E[\exp(W_t(\bs)] = \exp(\mu_W + \sigma_W^2/2) = 1$,
%\beqn\label{marg}
%  P[Z_t(\bs)>z] &=& P\left\{r_t>z\exp[-W_t(\bs)]\right\}\\
%  &=& \mbox{E}\{\exp[W_t(s)]\}/z = 1/z,\nonumber
%\eeqn
%so $Z_t(\bs)$ has GPD(1,1,1) marginal.
%
%The joint is
%\beqn\label{joint}
%  P[Z_t(\bs_1)>z,Z_t(\bs_2)>z] &=& P\left\{r_t>z\exp[-W_t(\bs_1)],r_t>z\exp[-W_t(\bs_2)]\right\}\\
%  &=& P\left\{r_t>z\exp[-\min\{W_t(\bs_1),W_t(\bs_2)\}]\right\}\nonumber\\
%  &=& \mbox{E}\{\exp[\min\{W_t(\bs_1),W_t(\bs_2)\}]\}/z = \chi(||\bs_1-\bs_2||)/z,\nonumber
%\eeqn
%where $\chi(||\bs_1-\bs_2||) = \mbox{E}\{\exp[\min\{W_t(\bs_1),W_t(\bs_2)\}]\}$.  In this model,
%\beq\label{chi}
%  P[Z_t(\bs_1)>z|Z_t(\bs_2)>z] = \frac{P[Z_t(\bs_1)>z,Z_t(\bs_2)>z]}{P[Z_t(\bs_2)>z]} = \chi(||\bs_1-\bs_2||).
%\eeq
%We'll have to work it out, but I'm guessing this is the Schlather model's extremal coefficient.  

\subsubsection{Addressing long-range dependence}
In (\ref{chi}), $\chi(h)$ does not decrease to zero as distance $h=||\bs_1-\bs_2||$ goes to infinity. 
This is because all spatial locations share a common random effect $r_t$, and thus locations separated by very large distances are still dependent.  
%Can we address this as in the Brown-Resnick version of the max-stable process?  Let's start by ignoring this, but keep in the back of our minds.

To remove long-range dependence, we allow the random effect to vary by subregion.  
We partition the spatial domain into $J$ subregions and set $r_t(\bs) = r_{tj}$ if $\bs$ is in subregion $j$. 
The partitioning is treated as a random process.
Let $\bv_{t1},...,\bv_{tJ}$ be spatial knots and assign
\beqn\label{partition}
  r_t(\bs) = r_{tj} \mbox{\ \ if \ \ } j=\argmin_l\{||\bs-\bv_{tl}||\}.
\eeqn 
The knots $\bv_{t1},...,\bv_{tJ}$ are generated from a homogeneous Poisson process, and $r_{tj}\iid \mbox{IG}(\xi_r, \sigma_r)$.
Denote $\pi(h)$ as the probability that two points separated by distance $h$ are in the same subregion.

Given that two locations are in the same subdomain, their joint survival function is (\ref{joint}).  If not,
\beqn\label{marg2}
  P[Y_t(\bs_1) > y, Y_t(\bs_2) > y ] &=& [1 - F(y)][ 1 - F(y)].
\eeqn
Combining both cases gives
\beqn\label{chi2full}
  P[Y_t(\bs_1)>y|Y_t(\bs_2)>y] = [1 - \pi(h)][1 - F(y)] + \pi(h)\left[1 - \frac{ F(y) - F_{12}(y) }{ 1 - F(y) } \right].
\eeqn
Then
\beqn\label{chi2}
  \lim_{y\rightarrow \infty}P[Y_t(\bs_1)>y|Y_t(\bs_2)>y] = \pi(h) \chi(h),
\eeqn
and 
\beqn\label{chi3}
  \lim_{y\rightarrow \infty , h\rightarrow \infty}P[Y_t(\bs_1)>y|Y_t(\bs_2)>y] = 0 
\eeqn
if $\lim_{h\rightarrow \infty}\pi(h) = 0 $. 
A proof of this is given in Appendix A.1.
Under this model, (\ref{chi2}) shows there is asymptotic dependence, and (\ref{chi3}) shows that long-range spatial dependence is eliminated.


%\subsection{Complete data - general GPD marginals}\label{s:model2}
%Returning to the complete data, $Y_t(\bs)$, if $Z_t(\bs)\sim$ GPD(1,1,1), then let $Y_t(\bs) = \mu(\bs) + \sigma g_{\xi}[Z_t(\bs)]$, where $g_{\xi}(z) = (z^{\xi}-1)/\xi$.  After this transformation, the marginal distribution of $Y_t$ is
%\beqn\label{Ymarg}
%	Y_t(\bs) &\sim& \mbox{GPD}[\mu^*(\bs),\sigma^*(\bs),\xi]
%\eeqn
%where $\mu(\bs)^* = \mu(\bs) + \frac{ \sigma }{ \xi } \{ \exp[\xi W_t(\bs)] - 1 \}$ and $\sigma(\bs)^* = \sigma \exp[ \xi W_t(\bs) ]$. This has the same tail dependence as in (\ref{chi}).  So, the final complete data model is
%\beqn\label{Y}
%  Y_t(\bs) &=& \mu(\bs) + \sigma g_{\xi}\left\{r_t\exp[W_t(\bs)]\right\}\\
%  r_t      &\iid&  \mbox{GPD}(1,1,1)\nonumber\\
%  W_t      &\iid&  \mbox{Mat}(\mu_W,\sigma_W,\rho,\nu, \alpha).\nonumber
%\eeqn
%Conditioned on the daily effect $r_t$, this model is essentially a Gaussian copula.  However, asymptotic dependence is induced by marginalizing over $r_t$.

%\textbf{Hierarchical:}
%Would it be worth exploring something like this?
%If $Z_t(\bs) \sim \mbox{GPD}(1,1,1)$ marginally, then $Y_t(\bs) = \mu(\bs) + \frac{\sigma}{\xi}[Z^(\bs)^\xi - 1]$ is marginally distributed as $\mbox{GPD}[\mu(\bs),\sigma,\xi]$. By defining $Z_t(\bs) = r_t \exp[ W_t(\bs)]$ the $Y$ terms exhibit the same tail dependence as in (\ref{chi}).  After marginalizing over $r_t$, this gives the hierarchical model
%\beqn\label{Y}
%  Y_t(\bs) | W_t(\bs) &\iid& \mbox{GPD}(\mu(\bs)^*, \sigma(\bs)^*, \xi)\\
%  W_t      &\iid&  \mbox{Mat}(\mu_W,\sigma_W,\rho,\nu, \alpha).\nonumber
%\eeqn
%where $\mu(\bs)^* = \mu(\bs) + \frac{ \sigma }{ \xi } \{ \exp[\xi W_t(\bs)] - 1 \}$ and $\sigma(\bs)^* = \sigma \exp[ \xi W_t(\bs) ]$.
%I think this would be an equivalent model to one where we have a different $r_t$ for each site.
%My hunch would be that this would suffer from some kind of asymptotic independence, but would something like this be helpful when dealing with the long-range dependence?


\subsection{Points above a threshold model}\label{s:pot}

So, considering (\ref{fullmodel}), the marginal distribution for the thresholded observations $\Yt_t(\bs)$ is  
\beqn\label{Ytmarg}
  P(\Yt_t(\bs) = T) &=& \Phi\left(\frac{ T - \mu(\bs) }{r_t \sigma_y} \right)\\
  P(\Yt_t(\bs) > y \,|\, \Yt_t(\bs) > T)  &=& \frac{ 1 - \Phi\left(\frac{ y - \mu(\bs) }{r_t \sigma_y} \right) }{ 1 - \Phi\left(\frac{ T - \mu(\bs) }{r_t \sigma_y} \right) }\nonumber
\eeqn
where $\Phi(\bZ(\bs))$ is a multivariate normal distribution that follows $\mbox{Mat}(0,\sigma_W,\rho,\nu, \alpha)$. The tail dependence is the same as in (\ref{chi}).  By allowing the location parameter $\mu(\bs)$ to vary spatially, the marginal probability of threshold exceedance varies by location.  We let $\mu(\bs)$ be a Gaussian process, or maybe just a function of spatial covariates like CMAQ?  We could also let $\sigma$ vary spatially to get an even more flexible model.


\section{Computation}\label{s:comp}

There are three main parts to the MCMC: (1) imputing the thresholded $Y_t(\bs)$; (2) updating the model parameters $\Theta=\{r_1,\ldots,r_m,\mu(\bs),\sigma_Y,\rho, \nu, \alpha, \mu_r, \sigma_r, \xi_r\}$; and (3) making spatial predictions.  All three require the joint distribution for complete data given $\Theta$.  Denote $\bY_t = [Y_t(\bs_1),...,Y_t(\bs_n)]^T$, and $Y_t(\bs)=G_{\Theta}[W_t(\bs)]$ defined by solving (\ref{fullmodel}).  The distribution of $Y_t|\Theta$ is the usual multivariate normal distribution
\beqn\label{copula}
   f(Y_t|\Theta) = \phi_n[Y_t|\mu(\bs),\sigma_Y,\rho,\nu_Y, r_1, \ldots, r_m]
\eeqn
where $\phi_n$ is the multivariate normal pdf with $\Matern$ correlation matrix.

\subsection{Imputation}\label{s:impute}

We can use Gibbs sampling to update $Y_t(\bs)$ for observations with $\Yt_t(\bs)=T$.
Given $\Theta$, $Y_t(\bs)$ has truncated normal full conditional with these parameter values.
So we sample $Y_t(\bs)\sim\mbox{TN}_{(-\infty,T)}(a,b^2)$.
Imputation of the $Y_t(\bs)$ terms is done based upon the conditional multivariate normal. 
If we consider the full distribution of $Y_t(\bs)$, then
\beqn
	Y_t(\bs) \sim \mbox{Mat}(\mu(\bs), \sigma_Y, \rho, \nu, \alpha).
\eeqn
where $\sigma_Y = r_t \sigma_W$.
So, to impute at each location below the threshold, the standard multivariate normal theory holds. 
So, conditional upon all other locations, at location $i$
\beqn
	Y_t(\bs_{(i)}) | Y_t(\bs_{(-i)}) \sim \mbox{TN}(a, b^2)
\eeqn
where
\beqn
	a 	&=& \mu_Y + \Sigma_{12} \Sigma^{-1}_{22}(y_{(-i)}(\bs) - \mu_Y)\\
	b^2	&=& \Sigma_{11} - \Sigma_{12}\Sigma^{-1}_{22}\Sigma_{21}.
\eeqn


\subsection{Parameter updates}
To update $\Theta$ given the current value of the complete data $\bY_1,...,\bY_{n_t}$, we use a standard Metropolis update using the likelihood (\ref{copula}).  

\subsection{Spatial prediction}
Given $\bY_t$ the usual Kriging equations give the predictive distribution for $Y_t(\bs^*)$ at prediction location $\bs^*$. 


\section{Simulation study}\label{s:sim}
Quantile scores. $T=0.8$ is data that has been thresholded at the 80th sample quantile.
\begin{table}[htdp]
\caption{Quantile Scores for $\alpha=.5$ settings}
\begin{center}
\begin{tabular}{r|rrr|rrr|rrr|}
	\hline
	&&	Setting 1&	&	&Setting 2	&	&	&Setting 3	&	\\
	\hline
	&$\xi=-0.25$	&	&$T=0.8$& $\xi=0.25$ & & $T=0.8$& $\xi=-0.25$&&$T=0.9$\\
	\hline
	&GPD& Gamma & Fixed & GPD & Gamma & Fixed & GPD &Gamma & Fixed\\
	\hline
	0.900 & 0.182 & 0.181 & 0.182	&0.271 & 0.270 & 0.280	& 0.121 & 0.121 & 0.127\\
	\hline
	0.950 & 0.131 & 0.131 & 0.133	&0.202 & 0.202 & 0.202	& 0.095 & 0.095 & 0.095\\
	\hline
	0.970	&	0.098	&	0.098	&	0.105	&	0.154	&	0.154	&	0.157	&	0.075	&	0.074	&	0.077\\
	\hline
0.990	&	0.050	&	0.049	&	0.066	&	0.077	&	0.077	&	0.092	&	0.039	&	0.038	&	0.050\\
	\hline
0.995	&	0.032	&	0.031	&	0.051	&	0.048	&	0.048	&	0.067	&	0.025	&	0.024	&	0.040\\
	\hline
0.999	&	0.012	&	0.011	&	0.032	&	0.015	&	0.015	&	0.037	&	0.008	&	0.008	&	0.026\\
	\hline
\end{tabular}
\end{center}
\label{lbl:quantsim5}
\end{table}%

\begin{table}[htdp]
\caption{Quantile Scores for $\alpha=.9$ settings}
\begin{center}
\begin{tabular}{r|rrr|rrr|rrr|}
	\hline
	&&	Setting 4&	&	&Setting 5	&	&	&Setting 6	&	\\
	\hline
	&$\xi=-0.25$	&	&$T=0.8$& $\xi=0.25$ & & $T=0.8$& $\xi=-0.25$&&$T=0.9$\\
	\hline
	&GPD& Gamma & Fixed & GPD & Gamma & Fixed & GPD &Gamma & Fixed\\
	\hline
0.900	&	0.179	&	0.179	&	0.180	&	0.229	&	0.229	&	0.231	&	0.117	&	0.117	&	0.122\\
	\hline
0.950	&	0.128	&	0.128	&	0.129	&	0.163	&	0.163	&	0.164	&	0.092	&	0.092	&	0.091\\
	\hline
0.970	&	0.095	&	0.095	&	0.098	&	0.119	&	0.119	&	0.123	&	0.071	&	0.071	&	0.072\\
	\hline
0.990	&	0.046	&	0.045	&	0.054	&	0.055	&	0.054	&	0.064	&	0.036	&	0.035	&	0.043\\
	\hline
0.995	&	0.027	&	0.026	&	0.038	&	0.032	&	0.032	&	0.043	&	0.022	&	0.022	&	0.032\\
	\hline
0.999	&	0.008	&	0.008	&	0.019	&	0.008	&	0.008	&	0.021	&	0.007	&	0.006	&	0.019\\
	\hline
\end{tabular}
\end{center}
\label{lbl:quantsim9}
\end{table}%




\section{Data analysis}\label{s:analysis}
Posteriors of $\beta$ terms. $\beta_0$ is intercept, $\beta_1$ is x coordinate, $\beta_2$ is y coordinate, $\beta_3$ is the CMAQ covariate.
\begin{table}[htdp]
\caption{Threshold: 0.80 - GPD - Beta Posteriors}
\begin{center}
\begin{tabular}{|c|rrr|rrr|rrr|}
	 &\multicolumn{3}{c|}{GPD}& \multicolumn{3}{c|}{Gamma}& \multicolumn{3}{c|}{Fixed}\\
	\hline
				& 0.025		&	0.500	&	0.975& 0.025		&	0.500	&	0.975& 0.025		&	0.500	&	0.975\\
	\hline
	\hline
	$\beta_0$	& 17.885	&	27.317	&	34.651 & 16.446	&	27.187	&	35.351& 9.801		&	17.565	&	24.322\\
	\hline
	$\beta_1$	& -4.000	&	-1.352	&	1.209& -3.993	&	-1.380	&	1.123& -3.894	&	-1.240	&	1.152\\
	\hline
	$\beta_2$	& -1.092	&	0.694	&	2.516& -1.017	&	0.720	&	2.516& -1.269	&	0.975	&	3.064\\
	\hline
	$\beta_3$	& 0.456		&	0.569	&	0.707& 0.444		&	0.570	&	0.727& 0.631		&	0.726	&	0.834\\
	\hline	
\end{tabular}
\end{center}
\label{lbl:beta80}
\end{table}%

\begin{table}[htdp]
\caption{Threshold: 0.90 - GPD - Beta Posteriors}
\begin{center}
\begin{tabular}{|c|rrr|rrr|rrr|}
	 &\multicolumn{3}{c|}{GPD}& \multicolumn{3}{c|}{Gamma}& \multicolumn{3}{c|}{Fixed}\\
	\hline
			& 0.025		&	0.500	&	0.975& 0.025		&	0.500	&	0.975& 0.025		&	0.500	&	0.975\\
	\hline
	\hline
	$\beta_0$	& 11.377	&	24.969	&	35.848& 12.893	&	25.372	&	37.379& -3.209	&	11.777	&	22.724\\
	\hline
	$\beta_1$	& -4.299	&	0.060	&	5.691& -4.513	&	-0.171	&	4.965& -4.983	&	-0.801	&	4.534\\
	\hline
	$\beta_2$	& -6.198	&	-2.082	&	0.963& -5.644	&	-1.789	&	1.288& -6.164	&	-2.269	&	1.175\\
	\hline
	$\beta_3$	& 0.514		&	0.663	&	0.829& 0.492		&	0.649	&	0.825& 0.696		&	0.845	&	1.028\\
	\hline	
\end{tabular}
\end{center}
\label{lbl:beta90}
\end{table}%


\begin{table}[htdp]
\caption{Threshold: 0.95 - GPD - Beta Posteriors}
\begin{center}
\begin{tabular}{|c|rrr|rrr|rrr|}
	 &\multicolumn{3}{c|}{GPD}& \multicolumn{3}{c|}{Gamma}& \multicolumn{3}{c|}{Fixed}\\
	\hline
			& 0.025		&	0.500	&	0.975& 0.025		&	0.500	&	0.975& 0.025		&	0.500	&	0.975\\
	\hline
	\hline
	$\beta_0$	& -21.936	&	16.245	&	29.874& -14.792	&	11.460	&	32.541& -47.356	&	-8.725	&	12.051\\
	\hline
	$\beta_1$	& 2.948		&	12.280	&	26.863& 3.497		&	13.160	&	27.162& 2.788		&	13.244	&	29.326\\
	\hline
	$\beta_2$	& -14.984	&	-8.785	&	-0.523& -17.402	&	-8.016	&	0.920& -19.640	&	-9.499	&	-0.563\\
	\hline
	$\beta_3$	& 0.599		&	0.773	&	1.164& 0.567		&	0.807	&	1.073& 0.807		&	1.057	&	1.454\\
	\hline	
\end{tabular}
\end{center}
\label{lbl:beta95}
\end{table}%


Table \ref{lbl:quant-5fold} has quantile scores. $T=0.8$ is data that has been thresholded at the 80th sample quantile.
\begin{table}[htdp]
\caption{Quantile Scores for 5-fold cross-validation}
\begin{center}
\begin{tabular}{r|rrr|rrr|rrr|}
	\hline
	&&	$T=0.80$&	&	&$T=0.95$	&	&	&$T=0.99$	&	\\
	\hline
			&GPD& Gamma & Fixed & GPD & Gamma & Fixed & GPD &Gamma & Fixed\\
	\hline
	0.900	&4.870	&4.870	&4.842	&4.925	&4.914	&4.900	&5.501	&5.229	&5.629 \\
	\hline
	0.950	&2.861	&2.861	&2.837	&2.871	&2.865	&2.861	&3.244	&3.077	&3.290 \\
	\hline
	0.970	&1.891	&1.890	&1.881	&1.898	&1.896	&1.891	&2.155	&2.047	&2.175 \\
	\hline
	0.990	&0.742	&0.742	&0.743	&0.749	&0.748	&0.741	&0.853	&0.816	&0.855 \\
	\hline
	0.995	&0.401	&0.400	&0.403	&0.405	&0.404	&0.399	&0.461	&0.442	&0.464 \\
	\hline
	0.999	&0.092	&0.093	&0.093	&0.096	&0.096	&0.094	&0.102	&0.096	&0.106 \\
	\hline
\end{tabular}
\end{center}
\label{lbl:quant-5fold}
\end{table}%

\section{Conclusions}\label{s:con}

\section*{Acknowledgments}

%\section*{Appendix A.1: Derivation of $\tilde{\sigma}(\bs)$}
%Over the threshold, $\Yt_t(\bs) = Y_t(\bs)$. So, to derive the parameters for the density of $\Yt(\bs)$ we need to consider the truncated likelihood of $Y(\bs)$. So
%\begin{align*}
%	F(\Yt(\bs) | \Yt(\bs) > T) &= \frac{ \left[ 1 + \frac{ \xi }{ \sigma^*(\bs)} (Y(\bs)- \mu^*(\bs)) \right]^{-1/\xi} }{ \left[ 1 + \frac{ \xi}{\sigma^*(\bs)} (T - \mu^*(\bs) ) \right]^{-1/\xi}} = \left[ \frac{ 1 + \frac{ \xi }{ \sigma^*(\bs) }(Y(\bs) - \mu^*(\bs) )}{1 + \frac{ \xi }{ \sigma^*(\bs) } (T - \mu^*(\bs) ) }\right]^{-1/\xi}\\
%		&= \left[ \frac{ 1 + \frac{\xi}{\sigma^*(\bs)} (Y(\bs) - T + T - \mu^*(\bs) ) }{ 1 + \frac{ \xi}{\sigma^*(\bs)}(T - \mu^*(\bs) ) } \right]^{-1/\xi}\\
%		&= \left[ 1 + \frac{ \frac{ \xi }{ \sigma^*(\bs)} (Y(\bs) - T ) }{ 1 + \frac{ \xi }{ \sigma^*(\bs) }(T - \mu^*(\bs) )} \right]^{ -1/\xi}\\
%		&= \left[ 1 + \frac{ \xi }{ \tilde{\sigma}(\bs) } (Y(\bs) - T) \right]^{ -1/\xi}
%\end{align*}
%where 
%\begin{align*}
%	\tilde{\sigma}(\bs) = \frac{ \sigma^*(\bs) }{ 1 + \frac{ \xi }{ \sigma^*(\bs) } (T - \mu^*(\bs) ) } = \sigma^*(\bs) \theta(\bs)^{ \xi }.
%\end{align*}
\newpage
\section*{Appendix A.1: Proof that $\lim_{h \rightarrow \infty} \pi(h) = 0$}
Consider two points $\bs_1, \bs_2 \in \calD \subset \calR^2$ with $h = || \bs_1 - \bs_2 ||$ and a set of random knots $\bv_1, \ldots, \bv_J$.
Define $g(\bs_i) = \argmin_j ||\bs_i - \bv_j ||$ to be the partition in which $\bs_i$ resides. 
Let $\pi(h) = \Pr[ g(\bs_1) = g(\bs_2) ]$. Then $\pi(h)$ is the probability that $\bv_j$ is the only knot in $(\ballunion \cap \calD)$ where $B_r(\bs)$ is a ball of radius $r > 0$ centered at point $\bs$.
Assume that $g(\bs_1) = g(\bs_2) = j$, and let $a = ||\bs_1 - \bv_j ||$ and $b = ||\bs_2 - \bv_j ||$. 
Then
\beqn
	\pi(h) = \Pr\left[\bv_2, \ldots, \bv_J \in (\ballunion)^C\right ] = \left( 1 - \frac{ | \ballunion | }{ | \calD |} \right)^{J - 1}
\eeqn
where $| \calS |$ is the area of $\calS \cap \calD$.
We know that $\max( |\ballunion |)$ occurs when $B_a(\bs_1) \cap B_b(\bs_2) = \emptyset$ and $\min( | \ballunion | )$ occurs when either $B_a(\bs_1) \subset B_b(\bs_2)$ or $B_b(\bs_2) \subset B_a(\bs_1)$.
Therefore
\beqn
	\left( 1 - \frac{ \pi ( a^2 + b^2) }{ | \calD |} \right)^{J - 1} \le \pi(h) \le \left( 1 - \frac{ \pi \max(a^2,b^2) }{ | \calD |} \right)^{J - 1}.
\eeqn
Then by the triangle inequality, $h \le a + b$, so as $h \rightarrow \infty$ we have $\max(a, b) \rightarrow \infty$ and hence $\max(a^2, b^2) \rightarrow \infty$.
Thus
\beqn
	\lim_{h \rightarrow \infty} \pi(h) \le \lim_{\max(a, b) \rightarrow \infty} \left( 1 - \frac{ \pi \max(a^2, b^2) }{ | \calD |}\right)^{ J - 1 } = \left( 1 - \frac{ | \calD |}{ | \calD |} \right)^{ J - 1 } = 0.
\eeqn
Therefore $\lim_{h \rightarrow \infty} \pi(h) = 0$.

\section*{Appendix A.2: MCMC Details}

\subsection*{Priors}
For a given day
\beqn
	r_{j} &\iid& \mbox{IG}(\xi_r, \sigma_r)\nonumber\\ 
	\sigma_r &\sim& \mbox{Gamma}(0.1, 0.1)\nonumber\\
	\xi_r &\sim& \mbox{Discrete Uniform}(0.5, 30)\nonumber\\
	\bv_{j} &\iid& \mbox{Uniform}(\calD)\nonumber\\
	\mu(\bs) &\sim& \mbox{MVN}(0, \mbox{diag}(10))\nonumber\\
	\log(\rho) &\sim& \mbox{N}(0, 10)\nonumber\\
	\log(\nu) &\sim& \mbox{N}(-1, 1)\nonumber\\
	\alpha &\sim& \mbox{Unif}(0, 1)\nonumber
\eeqn
where $\bv_j$ are the locations of the spatial knots over $\calD$, and $\alpha$ is a parameter controlling the proportion of $r^2_{j}$ that is attributed to the nugget and partial sill. 
If $\alpha = 0$, then $r^2_{j}$ is entirely attributed to the nugget effect, and if $\alpha = 1$, then $r^2_{j}$ is entirely attributed to the partial sill.
We use Gibbs sampling for $r_j, \sigma_r$, and $\mu(\bs)$. 
All other parameters are sampled using a random-walk Metropolis Hastings algorithm.

\bibliographystyle{rss}
\bibliography{Schlather}

\end{document}

